library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(limma)
library(psych)
library(tidyverse)
library(CONSTANd)  # install from source: https://github.com/PDiracDelta/CONSTANd/
source('./other_functions.R')
source('./plotting_functions.R')

This notebook presents an isobaric labeling data analysis strategy that includes data-driven normalization.

In other notebooks in this series we have systematically varied components and observed how they affect the outcome of a DEA analysis. We have seen that medianSweeping normalization does not work well for intensities on the original scale, and that CONSTANd does not work well on log2-transformed intensities. Here we compare medianSweeping on the log2 scale, which we know does a good job, with CONSTANd on the original intensity scale.

data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format
# which proteins were spiked in? (spike-in accessions contain the substring 'ups')
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character
# specify # of varying component variants and their names
variant.names <- c('medianSweeping', 'CONSTANd')
n.comp.variants <- length(variant.names)
# scale of the quantification values per variant; positionally aligned with
# variant.names: medianSweeping <-> 'log', CONSTANd <-> 'raw'
scale.vec <- c('log', 'raw')
# pick reference condition for making plots / doing DEA
referenceCondition <- '0.5'
# specify colours corresponding to biological conditions
condition.color <- tribble(
  ~Condition, ~Color,
  "0.125", 'black',
  "0.5", 'blue',
  "0.667", 'green',
  "1", 'red' )
# create data frame with sample info (distinct Run,Channel, Sample, Condition, Color)
sample.info <- get_sample_info(dat.l, condition.color)
channelNames <- remove_factors(unique(sample.info$Channel))

1 Unit scale component

# pre-allocate a named list with one slot per unit-scale variant
dat.unit.l <- emptyList(variant.names)

1.1 medianSweeping: log2 intensity

# medianSweeping variant works on log2-transformed intensities
dat.unit.l$medianSweeping <- dat.l %>%
  mutate(response = log2(Intensity)) %>%
  select(-Intensity)

1.2 CONSTANd: original intensity

# CONSTANd variant works on the original (untransformed) intensities
dat.unit.l$CONSTANd <- rename(dat.l, response = Intensity)

2 Normalization component

CONSTANd vs medianSweeping (in 2 steps)

# switch to (semi-)wide format: one column per quantification channel
dat.unit.w <- lapply(dat.unit.l, function(dat) {
  dat %>%
    pivot_wider(
      id_cols = -one_of(c('Condition', 'BioReplicate')),
      names_from = Channel,
      values_from = response
    )
})
# pre-allocate a named list for the normalized counterparts
dat.norm.w <- emptyList(names(dat.unit.w))

2.1 medianSweeping (1)

# medianSweeping step 1: center each spectrum (row) by subtracting its
# median log2 intensity computed across the quantification channels
dat.norm.w$medianSweeping <- dat.unit.w$medianSweeping
spectrum.medians <- apply(dat.norm.w$medianSweeping[, channelNames], 1, median, na.rm = TRUE)
dat.norm.w$medianSweeping[, channelNames] <- sweep(dat.norm.w$medianSweeping[, channelNames], 1, spectrum.medians)
dat.norm.w$medianSweeping
## # A tibble: 94,698 x 23
##    Mixture TechRepMixture Run   Protein Peptide    RT Charge PTM  
##    <fct>   <fct>          <fct> <fct>   <fct>   <dbl> <fct>  <fct>
##  1 Mixtur… 1              Mixt… Q9NQC3  [R].hQ…  101. 4      N-Te…
##  2 Mixtur… 1              Mixt… P05141  [K].dF…  167. 3      N-Te…
##  3 Mixtur… 1              Mixt… P49189  [K].tV…  183. 2      N-Te…
##  4 Mixtur… 1              Mixt… P62263  [R].iE…  133. 2      N-Te…
##  5 Mixtur… 1              Mixt… P21291  [K].gF…  145. 2      N-Te…
##  6 Mixtur… 1              Mixt… P30511  [K].wA…  137. 2      N-Te…
##  7 Mixtur… 1              Mixt… Q13185  [R].lT…  106. 2      N-Te…
##  8 Mixtur… 1              Mixt… P02787… [K].sA…  149. 3      N-Te…
##  9 Mixtur… 1              Mixt… Q9UBQ5  [K].iD…  202. 2      N-Te…
## 10 Mixtur… 1              Mixt… Q5H9R7  [R].tG…  107. 2      N-Te…
## # … with 94,688 more rows, and 15 more variables: TotalIntensity <dbl>,
## #   Ions.Score <int>, DeltaMZ <dbl>, isoInterOk <lgl>, noNAs <lgl>,
## #   onehit.protein <lgl>, shared.peptide <lgl>, `127C` <dbl>, `127N` <dbl>,
## #   `128C` <dbl>, `128N` <dbl>, `129C` <dbl>, `129N` <dbl>, `130C` <dbl>,
## #   `130N` <dbl>

2.2 CONSTANd

Now let’s apply CONSTANd to each Run separately, and then combine the results into a semi-wide dataframe again.

# Apply CONSTANd to each Run separately: split the semi-wide PSM table by Run,
# normalize each run's channel-intensity sub-matrix, then recombine the runs.
# NOTE(review): an earlier comment here referred to dat.unit.l (long format),
# but this code operates on the wide-format dat.unit.w.
x.split <- split(dat.unit.w$CONSTANd, dat.unit.w$CONSTANd$Run)  # apply CONSTANd to each Run separately
x.split.norm  <- lapply(x.split, function(y) {
  # replace the channel columns with their CONSTANd-normalized values
  y[,channelNames] <- CONSTANd(y[,channelNames])$normalized_data
  return(y)
})
dat.norm.w$CONSTANd <- bind_rows(x.split.norm)
dat.norm.w$CONSTANd
## # A tibble: 94,698 x 23
##    Mixture TechRepMixture Run   Protein Peptide    RT Charge PTM  
##    <fct>   <fct>          <fct> <fct>   <fct>   <dbl> <fct>  <fct>
##  1 Mixtur… 1              Mixt… Q99447  [K].rT…  61.6 3      N-Te…
##  2 Mixtur… 1              Mixt… Q86X55  [R].lL… 142.  2      N-Te…
##  3 Mixtur… 1              Mixt… Q96A33  [K].iM… 115.  2      N-Te…
##  4 Mixtur… 1              Mixt… Q9Y3D7  [K].sV… 129.  2      N-Te…
##  5 Mixtur… 1              Mixt… P18085  [R].iQ… 146.  3      N-Te…
##  6 Mixtur… 1              Mixt… P14868  [R].vT… 160.  3      N-Te…
##  7 Mixtur… 1              Mixt… Q13813  [K].dL… 119.  2      N-Te…
##  8 Mixtur… 1              Mixt… P49736  [R].gL… 177.  3      N-Te…
##  9 Mixtur… 1              Mixt… Q96T88  [K].lW… 196.  3      N-Te…
## 10 Mixtur… 1              Mixt… P20700  [K].sM…  96.2 2      N-Te…
## # … with 94,688 more rows, and 15 more variables: TotalIntensity <dbl>,
## #   Ions.Score <int>, DeltaMZ <dbl>, isoInterOk <lgl>, noNAs <lgl>,
## #   onehit.protein <lgl>, shared.peptide <lgl>, `127C` <dbl>, `127N` <dbl>,
## #   `128C` <dbl>, `128N` <dbl>, `129C` <dbl>, `129N` <dbl>, `130C` <dbl>,
## #   `130N` <dbl>

3 Summarization component: Median summarization

Summarize quantification values from PSM to peptide (first step) to protein (second step).

# normalized data
# group by (run,)protein,peptide then summarize twice (once on each level):
#  1) PSM -> peptide: per-channel median within (Run, Protein, Peptide);
#     summarise peels off the innermost grouping level (Peptide).
#  2) peptide -> protein: with (Run, Protein) groups remaining, take the
#     per-channel median again, yielding one row per Run x Protein.
dat.norm.summ.w <- lapply(dat.norm.w, function(x) x %>% group_by(Run, Protein, Peptide) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% ungroup() )

Notice that the row sums are not equal to Ncols anymore, because the median summarization does not preserve them (but mean summarization does).

4 Normalization component: medianSweeping (2)

# medianSweeping step 2: in each channel, subtract the median computed across
# all proteins within that channel; done separately for each MS run
runs <- split(dat.norm.summ.w$medianSweeping, dat.norm.summ.w$medianSweeping$Run)
runs.norm <- lapply(runs, function(run.dat) {
  channel.medians <- apply(run.dat[, channelNames], 2, median, na.rm = TRUE)
  run.dat[, channelNames] <- sweep(run.dat[, channelNames], 2, channel.medians)
  return(run.dat)
})
dat.norm.summ.w$medianSweeping <- bind_rows(runs.norm)

5 QC plots

# make data completely wide (also across runs): one column per Run:channel pair
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(dat) {
  pivot_wider(dat, names_from = Run, values_from = all_of(channelNames),
              names_glue = "{Run}:{.value}")
})

5.1 Boxplots

# use (half-)wide format; one boxplot panel per normalization variant
par(mfrow=c(1,2))
for (variant in variant.names) {
  boxplot_w(dat.norm.summ.w[[variant]], sample.info,
            paste('Normalized', variant, sep='_'))
}

5.2 MA plots

MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).

# use wide2 format
p <- emptyList(variant.names)
# seq_len() instead of 1:n — safe if the variant list were ever empty
for (i in seq_len(n.comp.variants)) {
  # scale.vec[i] tells maplot_ils whether this variant's values are log2 or raw
  p[[i]] <- maplot_ils(dat.norm.summ.w2[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N',
                       scale.vec[i], paste('Normalized', variant.names[i], sep='_'),
                       spiked.proteins)
}
grid.arrange(p[[1]], p[[2]], ncol=2)

MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).

# samples belonging to the numerator (condition 1) and denominator (condition 0.125)
channels.num <- sample.info %>% filter(Condition=='1') %>% pull(Sample)
channels.denom <- sample.info %>% filter(Condition=='0.125') %>% pull(Sample)
p <- emptyList(variant.names)
# seq_len() instead of 1:n — safe if the variant list were ever empty
for (i in seq_len(n.comp.variants)) {
  p[[i]] <- maplot_ils(dat.norm.summ.w2[[i]], channels.num, channels.denom,
                       scale=scale.vec[i],
                       paste('Normalized', variant.names[i], sep='_'),
                       spiked.proteins)
}
grid.arrange(p[[1]], p[[2]], ncol=2)

5.3 PCA plots

5.3.1 Using all proteins

par(mfrow=c(1, 2))
for (variant in variant.names) {
  # drop the Protein identifier column: PCA uses only the quantitative columns
  pcaplot_ils(dat.norm.summ.w2[[variant]] %>% select(-'Protein'),
              info=sample.info, paste('Normalized', variant, sep='_'))
}

5.3.2 Using spiked proteins only

par(mfrow=c(1, 2))
for (variant in variant.names) {
  # restrict to the spiked (ground-truth DE) proteins before running PCA
  spiked.only <- dat.norm.summ.w2[[variant]] %>%
    filter(Protein %in% spiked.proteins) %>%
    select(-'Protein')
  pcaplot_ils(spiked.only, info=sample.info, paste('Normalized', variant, sep='_'))
}

5.4 HC (hierarchical clustering) plots

Only use spiked proteins

par(mfrow=c(1, 2))
for (variant in variant.names) {
  # hierarchical clustering on the spiked proteins' quantification columns only
  spiked.only <- dat.norm.summ.w2[[variant]] %>%
    filter(Protein %in% spiked.proteins) %>%
    select(-Protein)
  dendrogram_ils(spiked.only, info=sample.info, paste('Normalized', variant, sep='_'))
}

6 DEA component: Moderated t-test

NOTE: lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias arising from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).

# run the moderated t-test DEA once per normalization variant
design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(names(dat.norm.summ.w2))
for (i in seq_along(dat.norm.summ.w2)) {
  variant <- variant.names[i]
  # look up the scale ('log' or 'raw') that corresponds to this variant
  this_scale <- scale.vec[match(names(dat.dea)[i], variant.names)]
  protein.mat <- as.data.frame(dat.norm.summ.w2[[variant]]) %>%
    column_to_rownames('Protein')
  dat.dea[[variant]] <- moderated_ttest(dat=protein.mat, design.matrix, scale=this_scale)
}

7 Results comparison

7.1 Confusion matrix

# confusion matrices per contrast: DE calls are made at q.mod < 0.05,
# with the spiked proteins taken as ground-truth positives
cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm, referenceCondition)
0.125 vs 0.5 contrast
medianSweeping
CONSTANd
background spiked background spiked
not_DE 4732 4 4735 5
DE 6 17 3 16
0.125 vs 0.5 contrast
medianSweeping CONSTANd
Accuracy 0.998 0.998
Sensitivity 0.810 0.762
Specificity 0.999 0.999
PPV 0.739 0.842
NPV 0.999 0.999
0.667 vs 0.5 contrast
medianSweeping
CONSTANd
background spiked background spiked
not_DE 4736 17 4736 15
DE 2 4 2 6
0.667 vs 0.5 contrast
medianSweeping CONSTANd
Accuracy 0.996 0.996
Sensitivity 0.190 0.286
Specificity 1.000 1.000
PPV 0.667 0.750
NPV 0.996 0.997
1 vs 0.5 contrast
medianSweeping
CONSTANd
background spiked background spiked
not_DE 4732 5 4731 3
DE 2 16 3 18
1 vs 0.5 contrast
medianSweeping CONSTANd
Accuracy 0.999 0.999
Sensitivity 0.762 0.857
Specificity 1.000 0.999
PPV 0.889 0.857
NPV 0.999 0.999

7.2 Scatter plots

# character vectors containing the logFC and significance (q-value) columns
# produced by the DEA (one of each per contrast)
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[grepl('logFC', dea.cols, fixed=TRUE)]
significance.cols <- dea.cols[grepl('q.mod', dea.cols, fixed=TRUE)]
n.contrasts <- length(logFC.cols)

# compare q-values across variants, per protein
scatterplot_ils(dat.dea, significance.cols, 'q-values', spiked.proteins)

# compare log2 fold-change estimates across variants, per protein
scatterplot_ils(dat.dea, logFC.cols, 'log2FC', spiked.proteins)

7.3 Volcano plots

# one volcano plot per contrast
for (contrast.idx in seq_len(n.contrasts)) {
  volcanoplot_ils(dat.dea, contrast.idx, spiked.proteins)
}

7.4 Violin plots

Let’s see whether the spiked protein fold changes make sense

# plot theoretical value (horizontal lines) and violin per variant,
# using only the spiked proteins' logFC estimates from each variant's DEA
violinplot_ils(lapply(dat.dea, function(x) x[spiked.proteins, logFC.cols]), referenceCondition)

8 Conclusions

9 Session information

# record R version and attached package versions for reproducibility
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_BE.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_BE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_BE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] CONSTANd_0.99.4   forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2      
##  [5] purrr_0.3.4       readr_1.4.0       tidyr_1.1.2       tibble_3.0.4     
##  [9] tidyverse_1.3.0   psych_2.0.9       limma_3.45.19     kableExtra_1.3.1 
## [13] dendextend_1.14.0 gridExtra_2.3     stringi_1.5.3     ggplot2_3.3.2    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-150         fs_1.5.0             lubridate_1.7.9     
##  [4] webshot_0.5.2        httr_1.4.2           tools_4.0.3         
##  [7] backports_1.1.10     utf8_1.1.4           R6_2.4.1            
## [10] rpart_4.1-15         DBI_1.1.0            mgcv_1.8-33         
## [13] colorspace_1.4-1     nnet_7.3-14          withr_2.3.0         
## [16] tidyselect_1.1.0     mnormt_2.0.2         compiler_4.0.3      
## [19] cli_2.1.0            rvest_0.3.6          xml2_1.3.2          
## [22] labeling_0.4.2       scales_1.1.1         digest_0.6.27       
## [25] rmarkdown_2.5        pkgconfig_2.0.3      htmltools_0.5.0     
## [28] highr_0.8            dbplyr_1.4.4         rlang_0.4.8         
## [31] readxl_1.3.1         rstudioapi_0.11      farver_2.0.3        
## [34] generics_0.0.2       jsonlite_1.7.1       ModelMetrics_1.2.2.2
## [37] magrittr_1.5         Matrix_1.2-18        Rcpp_1.0.5          
## [40] munsell_0.5.0        fansi_0.4.1          viridis_0.5.1       
## [43] lifecycle_0.2.0      pROC_1.16.2          yaml_2.2.1          
## [46] MASS_7.3-53          plyr_1.8.6           recipes_0.1.14      
## [49] grid_4.0.3           blob_1.2.1           parallel_4.0.3      
## [52] crayon_1.3.4         lattice_0.20-41      haven_2.3.1         
## [55] splines_4.0.3        hms_0.5.3            tmvnsim_1.0-2       
## [58] knitr_1.30           pillar_1.4.6         stats4_4.0.3        
## [61] reshape2_1.4.4       codetools_0.2-16     reprex_0.3.0        
## [64] glue_1.4.2           evaluate_0.14        data.table_1.13.2   
## [67] modelr_0.1.8         vctrs_0.3.4          foreach_1.5.1       
## [70] cellranger_1.1.0     gtable_0.3.0         assertthat_0.2.1    
## [73] xfun_0.18            gower_0.2.2          prodlim_2019.11.13  
## [76] broom_0.7.2          e1071_1.7-4          class_7.3-17        
## [79] survival_3.2-7       viridisLite_0.3.0    timeDate_3043.102   
## [82] iterators_1.0.13     lava_1.6.8           ellipsis_0.3.1      
## [85] caret_6.0-86         ipred_0.9-9